Per session and per user analysis

Analysis of users.

Table of Contents

  1. Preparation

  2. Constants

  3. Functions

Preparation


In [ ]:
%run "../Functions/1. Google form analysis.ipynb"

Constants


In [ ]:
# Columns kept when slicing the RedMetrics frame for per-session computations.
perSessionRelevantColumns = ['sessionId', 'serverTime', 'section']

# Tutorial checkpoints, in order: 'tutorial1.Checkpoint00' .. 'tutorial1.Checkpoint14'.
timedSectionsIndex = ['tutorial1.Checkpoint{:02d}'.format(number) for number in range(15)]

# Column layouts of the per-checkpoint result frames built by the functions below.
timedSectionsReachedColumns = ['firstReached', 'firstCompletionDuration']
timedSectionsDeathsColumns = ['deathsCount']
eventSectionsCountColumns = ['section', 'count']
eventSectionsColumns = ['count']

Functions


In [ ]:
## Comparison between game and Google form performance

In [ ]:
# Returns a given session's checkpoints, the first server time at which they were reached, and completion time
def getCheckpointsTimes( sessionId, _rmDF = rmdf152 ):
    """Build the per-checkpoint timing table of one session.

    Parameters:
        sessionId: value matched against the 'sessionId' column of _rmDF.
        _rmDF: events DataFrame; must provide the 'type' column and the
            columns listed in perSessionRelevantColumns.

    Returns:
        DataFrame indexed by timedSectionsIndex with two columns:
        - 'firstReached': earliest 'serverTime' of a 'reach' event on that
          checkpoint, or pd.Timestamp(0, tz='utc') when never reached;
        - 'firstCompletionDuration': difference with the previous checkpoint's
          'firstReached', or pd.Timedelta.max when it cannot be computed.
    """
    # Sentinel meaning "this checkpoint was never reached".
    neverReached = pd.Timestamp(0, tz='utc')

    reachEvents = _rmDF[_rmDF['type']=='reach'].loc[:,perSessionRelevantColumns]
    perSession = reachEvents[reachEvents['sessionId']==sessionId]
    perSession = perSession[perSession['section'].str.startswith('tutorial', na=False)]

    timedSections = pd.DataFrame(data=0, columns=timedSectionsReachedColumns,index=timedSectionsIndex)
    timedSections['firstReached'] = neverReached
    timedSections['firstCompletionDuration'] = pd.Timedelta.max

    if(len(perSession) > 0):
        # Assigning a Series aligns on the checkpoint names: checkpoints without
        # any 'reach' event become NaT, sections outside the index are dropped.
        timedSections["firstReached"] = perSession.groupby("section")["serverTime"].min()
        timedSections["firstCompletionDuration"] = timedSections["firstReached"].diff()

        # The first checkpoint has no predecessor: its duration is 0, but only
        # when it was actually reached. At this point an unreached checkpoint is
        # NaT (not the sentinel yet), hence the explicit null check.
        firstCheckpointTime = timedSections.loc["tutorial1.Checkpoint00","firstReached"]
        if(pd.notnull(firstCheckpointTime) and (firstCheckpointTime != neverReached)):
            timedSections.loc["tutorial1.Checkpoint00","firstCompletionDuration"] = \
            pd.Timedelta(0)

    # Replace the remaining missing values with the sentinels.
    timedSections["firstReached"] = timedSections["firstReached"].fillna(neverReached)
    timedSections["firstCompletionDuration"] = timedSections["firstCompletionDuration"].fillna(pd.Timedelta.max)

    return timedSections

In [ ]:
# Returns a given user's checkpoints, the first server time at which they were reached, and completion time
def getCheckpointsTimesUser( userId, _sessionsList = [], _rmDF = rmdf152 ):
    """Merge the checkpoint timing tables of all sessions of a user.

    Parameters:
        userId: user identifier passed to getUserSessions.
        _sessionsList: sessions to consider; when empty, the list is obtained
            from getUserSessions(_rmDF, userId). The argument is never mutated.
        _rmDF: events DataFrame, forwarded to getCheckpointsTimes.

    Returns:
        DataFrame indexed by timedSectionsIndex with columns 'firstReached' and
        'firstCompletionDuration'. For each checkpoint the values come from the
        session in which it was reached the earliest; checkpoints never reached
        keep pd.Timestamp(0, tz='utc') and pd.Timedelta.max.
    """
    # List of associated sessions
    if( len(_sessionsList) == 0):
        _sessionsList = getUserSessions(_rmDF, userId)

    # Sentinel meaning "this checkpoint was never reached".
    _neverReached = pd.Timestamp(0, tz='utc')

    _timedSections = pd.DataFrame(data=0, columns=timedSectionsReachedColumns,index=timedSectionsIndex)
    _timedSections["firstReached"] = _neverReached
    _timedSections["firstCompletionDuration"] = pd.Timedelta.max

    for _sessionId in _sessionsList:
        # Forward _rmDF so that a caller-supplied frame is actually used.
        _thisSessionTimes = getCheckpointsTimes( _sessionId, _rmDF = _rmDF )

        for _checkpointName in _thisSessionTimes.index:
            _candidate = _thisSessionTimes.loc[_checkpointName, 'firstReached']
            if _candidate == _neverReached:
                # not reached during this session
                continue

            _current = _timedSections.loc[_checkpointName, 'firstReached']
            # keep the oldest time at which the checkpoint was reached
            if (_current == _neverReached) or (_current > _candidate):
                _timedSections.loc[_checkpointName, 'firstReached'] = _candidate
                _timedSections.loc[_checkpointName, 'firstCompletionDuration'] = \
                    _thisSessionTimes.loc[_checkpointName, 'firstCompletionDuration']

    return _timedSections

In [ ]:
def getPlayedTimeSessionMode(sessionEvents, mode):
    """Compute the time played in one game mode from the events of a session.

    Parameters:
        sessionEvents: DataFrame with a 'section' column (strings, possibly
            missing) and a 'userTime' column (datetimes).
        mode: section prefix selecting the game mode, e.g. 'tutorial'.

    Returns:
        dict with:
        - 'daysSpent': set of the daily bin timestamps produced by grouping the
          selected events per calendar day;
        - 'totalSpentTime': pd.Timedelta, sum over the days of
          (last event time - first event time).
        When no event matches, returns an empty set and pd.Timedelta(0).
    """
    modeTimes = sessionEvents[sessionEvents['section'].str.startswith(mode, na=False)]['userTime']

    daysSpent = set()
    totalSpentTime = pd.Timedelta(0)

    if(len(modeTimes) > 0):
        # Index the event times by themselves so they can be binned per day.
        modeTimes = pd.Series(modeTimes.values, index=pd.DatetimeIndex(modeTimes.values))

        # pd.Grouper replaces the removed pd.TimeGrouper; min and max are taken
        # separately because dict-renaming agg on a Series is no longer supported.
        perDay = modeTimes.groupby(pd.Grouper(freq='D'))
        dailyTimes = pd.DataFrame({'start': perDay.min(), 'end': perDay.max()})

        # NOTE(review): daily bins are contiguous, so days without any event
        # lying between the first and last event are part of this set too.
        daysSpent = set(dailyTimes.index)

        dailyTimes['played'] = dailyTimes['end'] - dailyTimes['start']
        totalSpentTime = dailyTimes['played'].sum()

    return {'daysSpent': daysSpent, 'totalSpentTime': totalSpentTime}

In [ ]:
# Returns a given session's total playtime and day count
def getPlayedTimeSession( sessionId, _rmDF = rmdf152 ):
    """Return the played-time summary of a session, per game mode.

    The result maps 'tutorial' and 'sandbox' to the dictionaries returned by
    getPlayedTimeSessionMode for the events of _rmDF whose 'sessionId'
    equals sessionId.
    """
    eventsOfSession = _rmDF[_rmDF['sessionId']==sessionId]
    return {
        gameMode: getPlayedTimeSessionMode(eventsOfSession, gameMode)
        for gameMode in ('tutorial', 'sandbox')
    }

In [ ]:
def mergePlayedTimes(a, b):
    """Combine two played-time summaries into a new dictionary.

    For every game mode of `a`, 'totalSpentTime' values are added and
    'daysSpent' sets are united. `b` must contain every game mode of `a`;
    game modes only present in `b` are ignored. Inputs are left untouched.
    """
    return {
        gameMode: {
            'totalSpentTime': played['totalSpentTime'] + b[gameMode]['totalSpentTime'],
            'daysSpent': played['daysSpent'] | b[gameMode]['daysSpent'],
        }
        for gameMode, played in a.items()
    }

In [ ]:
# Returns a given user's total playtime and day count
def getPlayedTimeUser( userId, _sessionsList = [], _rmDF = rmdf152 ):
    """Accumulate the played-time summaries of all sessions of a user.

    When _sessionsList is empty, the sessions are obtained from
    getUserSessions(_rmDF, userId). The result has the structure returned by
    getPlayedTimeSession.
    """
    # The summary of the '' session id serves as the starting value.
    accumulated = getPlayedTimeSession('', _rmDF = _rmDF)

    if(len(_sessionsList) == 0):
        _sessionsList = getUserSessions(_rmDF, userId)

    for oneSessionId in _sessionsList:
        accumulated = mergePlayedTimes(accumulated, getPlayedTimeSession(oneSessionId, _rmDF))

    return accumulated

In [ ]:
# Returns a given session's checkpoints, and death count
def getDeaths( sessionId, _rmDF = rmdf152 ):
    """Count the 'death' events of a session per tutorial section.

    Returns a DataFrame with one row per section having at least one death,
    with columns 'section' and 'deathsCount'.
    """
    isDeath = _rmDF['type']=='death'
    deaths = _rmDF[isDeath].loc[:,perSessionRelevantColumns]
    deaths = deaths[deaths['sessionId']==sessionId]
    inTutorial = deaths['section'].str.startswith('tutorial', na=False)
    return deaths[inTutorial].groupby("section").size().reset_index(name='deathsCount')

In [ ]:
def getDeathsUser( userId, _rmDF = rmdf152 ):
    """Sum, per checkpoint, the deaths of all sessions of a user.

    Parameters:
        userId: user identifier passed to getUserSessions.
        _rmDF: events DataFrame, used to list the sessions and forwarded to
            getDeaths.

    Returns:
        DataFrame indexed by timedSectionsIndex with a single 'deathsCount'
        column (0 for checkpoints without death).

    Raises:
        KeyError: if a death happened in a tutorial section that is not listed
            in timedSectionsIndex.
    """
    # List of associated sessions
    sessionsList = getUserSessions(_rmDF, userId)

    deathsSections = pd.DataFrame(0, columns=timedSectionsDeathsColumns,index=timedSectionsIndex)

    for sessionId in sessionsList:
        # Forward _rmDF so that a caller-supplied frame is actually used.
        deaths = getDeaths( sessionId, _rmDF = _rmDF )

        # merge by adding; .loc writes into the frame itself, whereas chained
        # indexing may write into a temporary copy
        for checkpointName, deathsCount in zip(deaths['section'], deaths['deathsCount']):
            deathsSections.loc[checkpointName, 'deathsCount'] = \
                deathsSections.loc[checkpointName, 'deathsCount'] + deathsCount

    return deathsSections

Craft events: equip, unequip, add, remove

event-column association

equip device = 'add' + customData.device

unequip device = 'remove' + customData.device

add brick = 'add' + customData.biobrick

remove brick = 'remove' + customData.biobrick

In [ ]:
# Static data
# craftEventsColumns = pd.DataFrame(
#    index=list(range(4)),
#    data={
#        'eventCode' : pd.Categorical(["equip","unequip","add","remove"]),
#        'eventType' : pd.Categorical(["add","remove","add","remove"]),
#        'column' : pd.Categorical(["customData.device","customData.device","customData.biobrick","customData.biobrick"]),
#    }
#)
#craftEventsColumns

In [ ]:
# Static data
# Craft event codes, and for each of them the RedMetrics event type it is
# logged under and the customData column that must be filled.
craftEventCodes = ["equip", "unequip", "add", "remove"]
craftEventsColumns = pd.DataFrame(
    {
        'eventType' : pd.Categorical(["add", "remove", "add", "remove"]),
        'column' : pd.Categorical([
            "customData.device",
            "customData.device",
            "customData.biobrick",
            "customData.biobrick",
        ]),
    },
    index=craftEventCodes,
)

In [ ]:
# Returns a given session's checkpoints, and event count
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getSectionsCraftEvents( eventCode, sessionId, _rmDF = rmdf152 ):
    """Count the craft events of a given code per tutorial section of a session.

    An event matches when its 'type' equals the event type associated with
    eventCode in craftEventsColumns and the associated customData column is
    not null. Returns a DataFrame with columns 'section' and 'count'; for an
    unknown eventCode a message is printed and an empty frame is returned.
    """
    if eventCode not in craftEventCodes:
        print("incorrect event code '" + eventCode + "'")
        return pd.DataFrame(0, columns=eventSectionsCountColumns, index=range(0))

    redMetricsType = craftEventsColumns['eventType'][eventCode]
    payloadColumn = craftEventsColumns['column'][eventCode]

    candidates = _rmDF[_rmDF['type']==redMetricsType]
    candidates = candidates[candidates[payloadColumn].notnull()]
    candidates = candidates.loc[:,perSessionRelevantColumns]

    ofSession = candidates[candidates['sessionId']==sessionId]
    ofSession = ofSession[ofSession['section'].str.startswith('tutorial', na=False)]
    return ofSession.groupby("section").size().reset_index(name='count')

In [ ]:
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getUserSectionsCraftEvents( eventCode, userId, sessionsList = [], _rmDF = rmdf152 ):
    """Sum, per checkpoint, the craft events of a given code over a user's sessions.

    Parameters:
        eventCode: one of craftEventCodes.
        userId: user identifier passed to getUserSessions.
        sessionsList: sessions to consider; when empty, the list is obtained
            from getUserSessions(_rmDF, userId). The argument is never mutated.
        _rmDF: events DataFrame, forwarded to getSectionsCraftEvents.

    Returns:
        DataFrame indexed by timedSectionsIndex with a single 'count' column.
        For an unknown eventCode a message is printed and all counts are 0.

    Raises:
        KeyError: if an event happened in a tutorial section that is not listed
            in timedSectionsIndex.
    """
    userSectionsEvents = pd.DataFrame(0, columns=eventSectionsColumns,index=timedSectionsIndex)

    if eventCode in craftEventCodes:
        # List of associated sessions
        if(len(sessionsList) == 0):
            sessionsList = getUserSessions(_rmDF, userId)

        for sessionId in sessionsList:
            # Forward _rmDF so that a caller-supplied frame is actually used.
            sessionSectionsEvents = getSectionsCraftEvents( eventCode, sessionId, _rmDF = _rmDF )

            # merge by adding; .loc writes into the frame itself, whereas
            # chained indexing may write into a temporary copy
            for checkpointName, eventsCount in zip(sessionSectionsEvents['section'], sessionSectionsEvents['count']):
                userSectionsEvents.loc[checkpointName, 'count'] = \
                    userSectionsEvents.loc[checkpointName, 'count'] + eventsCount
    else:
        print("incorrect event code '" + eventCode + "'")
    return userSectionsEvents

In [ ]:
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getUserSectionsCraftEventsTotal( eventCode, userId, sessionsList = [] ):
    """Return the sum of all per-checkpoint counts given by getUserSectionsCraftEvents."""
    perCheckpoint = getUserSectionsCraftEvents( eventCode, userId, sessionsList )
    total = perCheckpoint.values.sum()
    return total

craft, no section


In [ ]:
# eventCode != RedMetrics' eventType
# eventCodes are craftEventsColumns' index
def getUserCraftEventsTotal( eventCode, userId, sessionsList=[], _rmDF = rmdf152 ):
    """Count a user's craft events of a given code, whatever the section.

    When sessionsList is empty, the sessions are obtained from
    getUserSessions(_rmDF, userId). For an unknown eventCode a message is
    printed and 0 is returned.
    """
    if(len(sessionsList) == 0):
        sessionsList = getUserSessions(_rmDF, userId)

    if eventCode not in craftEventCodes:
        print("incorrect event code '" + eventCode + "'")
        return 0

    redMetricsType = craftEventsColumns['eventType'][eventCode]
    matching = _rmDF[_rmDF['type']==redMetricsType]
    payloadColumn = craftEventsColumns['column'][eventCode]
    matching = matching[matching[payloadColumn].notnull()]
    return len(matching[matching['sessionId'].isin(sessionsList)])

Generic functions

Generic count


In [ ]:
# Returns a given session's checkpoints, and event count
def getSectionsEvents( eventType, sessionId, _rmDF = rmdf152 ):
    """Count the events of a given type per tutorial section of a session.

    Returns a DataFrame with one row per section having at least one such
    event, with columns 'section' and 'count'.
    """
    ofType = _rmDF[_rmDF['type']==eventType].loc[:,perSessionRelevantColumns]
    ofSession = ofType[ofType['sessionId']==sessionId]
    inTutorial = ofSession['section'].str.startswith('tutorial', na=False)
    return ofSession[inTutorial].groupby("section").size().reset_index(name='count')

In [ ]:
def getUserSectionsEvents( eventType, userId, sessionsList=[], _rmDF = rmdf152 ):
    """Sum, per checkpoint, the events of a given type over a user's sessions.

    Parameters:
        eventType: value matched against the 'type' column of _rmDF.
        userId: user identifier passed to getUserSessions.
        sessionsList: sessions to consider; when empty, the list is obtained
            from getUserSessions(_rmDF, userId). The argument is never mutated.
        _rmDF: events DataFrame, forwarded to getSectionsEvents.

    Returns:
        DataFrame indexed by timedSectionsIndex with a single 'count' column.

    Raises:
        KeyError: if an event happened in a tutorial section that is not listed
            in timedSectionsIndex.
    """
    # List of associated sessions
    if(len(sessionsList) == 0):
        sessionsList = getUserSessions(_rmDF, userId)

    userSectionsEvents = pd.DataFrame(0, columns=eventSectionsColumns,index=timedSectionsIndex)

    for sessionId in sessionsList:
        # Forward _rmDF so that a caller-supplied frame is actually used.
        sessionSectionsEvents = getSectionsEvents( eventType, sessionId, _rmDF = _rmDF )

        # merge by adding; .loc writes into the frame itself, whereas chained
        # indexing may write into a temporary copy
        for checkpointName, eventsCount in zip(sessionSectionsEvents['section'], sessionSectionsEvents['count']):
            userSectionsEvents.loc[checkpointName, 'count'] = \
                userSectionsEvents.loc[checkpointName, 'count'] + eventsCount

    return userSectionsEvents

In [ ]:
def getUserSectionsEventsTotal( eventType, userId, sessionsList=[] ):
    """Return the sum of all per-checkpoint counts given by getUserSectionsEvents."""
    perCheckpoint = getUserSectionsEvents( eventType, userId, sessionsList )
    total = perCheckpoint.values.sum()
    return total

No section events


In [ ]:
def getUserEventsTotal( eventType, userId, sessionsList=[], _rmDF = rmdf152 ):
    """Count a user's events of a given type, whatever the section.

    When sessionsList is empty, the sessions are obtained from
    getUserSessions(_rmDF, userId).
    """
    if(len(sessionsList) == 0):
        sessionsList = getUserSessions(_rmDF, userId)

    hasType = _rmDF['type']==eventType
    ofType = _rmDF[hasType]
    return len(ofType[ofType['sessionId'].isin(sessionsList)])

Other


In [ ]:
# Returns a given user's unique reached checkpoints
def getUserCheckpoints( userId, _rmDF = rmdf152 ):
    """Return, as a Series, the distinct tutorial sections reached by a user.

    The sections are those of the 'reach' events of _rmDF whose session belongs
    to getUserSessions(_rmDF, userId) and whose section starts with 'tutorial'.
    """
    userSessions = getUserSessions(_rmDF, userId)

    reached = _rmDF[_rmDF['type']=='reach'].loc[:,perSessionRelevantColumns]
    reached = reached[reached['sessionId'].isin(userSessions)]
    reached = reached[reached['section'].str.startswith('tutorial', na=False)]

    distinctSections = reached['section'].unique()
    return pd.Series(distinctSections)


def getDiscrepancyGameGForm( userId ):
    """Compare the checkpoints validated in the Google form with those reached in the game.

    Parameters:
        userId: user identifier passed to getNonValidatedCheckpoints,
            getValidatedCheckpoints and getUserCheckpoints.

    Returns:
        Tuple (gameNotEnough, gformNotEnough) of Series:
        - gameNotEnough: checkpoints validated in the form but not reached in
          the game (sorted, unique);
        - gformNotEnough: checkpoints not validated in the form although they
          are not further than the furthest checkpoint reached in the game.
          Empty when no checkpoint was reached in the game.
    """
    gformNonVal = getNonValidatedCheckpoints(userId)
    gformVal = getValidatedCheckpoints(userId)
    gameVal = getUserCheckpoints(userId)

    #user has answered questions whose answer they haven't seen in the game
    #np.setdiff1d: sorted, unique values of the first array that are not in the second
    gameNotEnough = pd.Series(np.setdiff1d(gformVal.values, gameVal.values))

    #user has not answered questions whose answer they have seen in the game
    # Furthest checkpoint reached; the maximum used to be computed but never
    # stored, so the comparison below always ran against ''.
    maxGameVal = ''
    if gameVal.values.size!=0:
        maxGameVal = gameVal.values.max()

    # NOTE(review): a checkpoint counts as "seen in the game" when its name is
    # not greater than maxGameVal - confirm this matches the intended semantics.
    gformNotEnough = pd.Series([nonVal for nonVal in gformNonVal.values if nonVal <= maxGameVal])

    return (gameNotEnough, gformNotEnough)

In [ ]:
# Static data
# Event codes listed as not being tied to a section.
noSectionEventCodes = [
    'start',
    'selectmenu',
    'switch',
    'restart',
    'gotourl',
    'gotomooc',
    'configure',
]

In [ ]:
# Event codes that are simply counted per user in the user data vector.
simpleEvents = [
    'complete', 'configure', 'craft', 'death',
    'equip', 'unequip', 'add', 'remove',
    'gotomooc', 'gotourl', 'pickup', 'reach',
    'restart', 'selectmenu', 'start', 'switch',
]

# Row labels of the user data vector: the sessions count, then one score per
# answer temporality, then one count per simple event.
userDataVectorIndex = ['sessionsCount']

for temporality in answerTemporalities:
    userDataVectorIndex += [scoreLabel + temporality]

userDataVectorIndex = np.concatenate((userDataVectorIndex, simpleEvents))

In [ ]:
#allEvents = rmdf152['type'].unique()
#allEvents = np.concatenate( simpleEvents, allEvents ).unique()
#allUserDataVectorIndex = np.concatenate( userDataVectorIndex, allEvents ).unique()

In [ ]:
# userId is RedMetrics user id
# _source is used as correction source, if we want to include answers to these questions
def getUserDataVector( userId, _source = correctAnswers, _rmDF = rmdf152 ):
    """Build the single-column data vector describing a user.

    Parameters:
        userId: RedMetrics user id; str(userId) is used as column name.
        _source: correction source passed to getBinarized; when empty, the
            survey answers are not added to the vector.
        _rmDF: events DataFrame, forwarded to every RedMetrics helper.

    Returns:
        DataFrame with one column, whose rows are: the labels of
        userDataVectorIndex (sessions count, scores, event counts),
        'maxChapter', 'efficiency', 'thoroughness', 'fun', 'completionTime',
        one integer-labelled row per chapter holding its completion time in
        seconds, and, when available, one row per binarized survey question.
    """
    sessionsList = getUserSessions(_rmDF, userId)

    columnName = str(userId)

    data = pd.DataFrame(0, columns=[columnName],index=userDataVectorIndex)

    # survey scores: last 'before' answer, first answer of the other temporalities
    score = getScore( userId )
    for _temporality in score.columns:
        _score = score.loc[scoreLabel,_temporality]
        if(len(_score)>0):
            if(_temporality == 'before'):
                _score = _score[len(_score)-1]
            else:
                _score = _score[0]
        else:
            _score = np.nan
        data.loc[scoreLabel+_temporality,columnName] = _score

    data.loc['sessionsCount',columnName] = len(sessionsList)

    # event counts; _rmDF is forwarded so that a caller-supplied frame is used
    for eventName in simpleEvents:
        if eventName in craftEventCodes:
            data.loc[eventName,columnName] = getUserCraftEventsTotal(eventName, userId, sessionsList, _rmDF = _rmDF)
        else:
            data.loc[eventName,columnName] = getUserEventsTotal(eventName, userId, sessionsList, _rmDF = _rmDF)

    # furthest chapter reached: two last characters of the greatest checkpoint
    # name, 'tutorial1.Checkpoint00' being the floor.
    # pd.concat replaces Series.append, which newer pandas versions removed.
    reachedCheckpoints = pd.concat([
        pd.Series(['tutorial1.Checkpoint00']),
        getUserCheckpoints(userId, _rmDF = _rmDF),
    ])
    data.loc['maxChapter', columnName] = int(reachedCheckpoints.max()[-2:])

    # time spent on each chapter
    times = getCheckpointsTimesUser(userId, _sessionsList = sessionsList, _rmDF = _rmDF)

    completionTime = 0
    chapterTimes = []
    for chapter in timedSectionsIndex:
        deltaTime = times.loc[chapter,"firstCompletionDuration"].total_seconds()
        chapterTimes.append((int(chapter[-2:]), deltaTime))
        completionTime += deltaTime

    # efficiency = (1 + #unlockedchapters)/(time * (1 + #death + #craft + #add + #equip))
    actionsCount = (
        1
        + data.loc['death', columnName]
        + data.loc['craft', columnName]
        + data.loc['add', columnName]
        + data.loc['equip', columnName]
    )
    data.loc['efficiency', columnName] = np.log(
        ( 1 + data.loc['maxChapter', columnName] ) / (completionTime * actionsCount)
    )

    playedTime = getPlayedTimeUser(userId, _sessionsList = sessionsList, _rmDF = _rmDF)

    data.loc['thoroughness', columnName] = \
    data.loc['craft', columnName]\
    * data.loc['pickup', columnName]\
    * ( 1 + np.power(len(playedTime['sandbox']['daysSpent']),2))

    totalSpentTime = playedTime['tutorial']['totalSpentTime'] + playedTime['sandbox']['totalSpentTime']
    totalSpentDays = len(playedTime['tutorial']['daysSpent'] | playedTime['sandbox']['daysSpent'])
    data.loc['fun', columnName] = np.log(
        max(1, totalSpentTime.total_seconds() * np.power(totalSpentDays,2))
    )

    data.loc['completionTime', columnName] = completionTime
    for chapterNumber, deltaTime in chapterTimes:
        data.loc[chapterNumber,columnName] = deltaTime

    if(len(_source) != 0):
        if(hasAnswered(userId)):
            # pick one survey answer: first 'after', else last 'before', else last one
            gformLine = gform[gform[localplayerguidkey] == userId]
            afters = gformLine[gformLine['Temporality'] == 'after']
            if(len(afters) > 0):
                gformLine = afters.iloc[0]
            else:
                befores = gformLine[gformLine['Temporality'] == 'before']
                if(len(befores) > 0):
                    gformLine = befores.iloc[len(befores)-1]
                else:
                    gformLine = gformLine.iloc[len(gformLine)-1]

            # add data from the gform: binary score on each question
            gformData = getBinarized(gformLine, _source = _source)

            for question in gformData.index:
                data.loc[question,columnName] = gformData.loc[question]
        else:
            # str() keeps the warning from failing on a non-string user id
            print("warning: user " + str(userId) + " has never answered the survey")

    return data

In [ ]:
# for per-session, manual analysis
def getSessionDataPreview( _sessionId, _rmDF = rmdf152 ):
    """Summarize a session for manual inspection.

    Returns a dict with the smallest and greatest 'userTime' of the session
    ('first', 'last'), the first non-null 'customData.platform' value or ''
    ('platform'), and the count of events per 'type' ('events').
    """
    _sessionLogs = _rmDF[_rmDF['sessionId'] == _sessionId]

    _orderedTimes = _sessionLogs['userTime'].sort_values()

    _knownPlatforms = _sessionLogs['customData.platform'].dropna().values
    _platform = _knownPlatforms[0] if len(_knownPlatforms) > 0 else ''

    _eventCounts = _sessionLogs['type'].value_counts()

    _preview = {}
    _preview['first'] = _orderedTimes.iloc[0]
    _preview['last'] = _orderedTimes.iloc[-1]
    _preview['platform'] = _platform
    _preview['events'] = _eventCounts
    return _preview

In [ ]:
# for per-user, manual analysis
def getUserDataPreview(userId, _rmDF = rmdf152):
    """Build a one-column DataFrame summarizing a user, for manual inspection.

    The column is labelled userId. Rows are appended in this order: a
    'REDMETRICS ANALYSIS' separator, the sessions count, the first event date,
    four rows per session (platform, first time, last time, events), a
    'GFORM ANALYSIS' separator, one score row per temporality, the string form
    of all scores, then the rows derived from getGFormDataPreview.

    Parameters:
        userId: user identifier, also used as column label.
        _rmDF: events DataFrame passed to getUserSessions and
            getSessionDataPreview.
    """
# Checklist of the planned content of the preview:
#    [ ] RM
#      [ ] sessions count
#      [ ] first event date
#      [ ] time played
#      [ ] dates played
#      [ ] first played, last played
#      [ ] best chapter
#      [ ] counts of events: deaths, crafts,...
#      [ ] gaming platform
#    [ ] GF
#      [ ] score(s)
#        [ ] progression
#      [ ] temporality
#        [ ] temporality according to answers
#        [ ] #before
#        [ ] #after
#      [ ] demographics

    result = pd.DataFrame(
        columns = [userId]
    )

    #    [ ] RM
    # separator row
    result.loc['REDMETRICS ANALYSIS'] = ' '
    #      [ ] sessions count
    sessions = getUserSessions(_rmDF, userId)
    result.loc['sessions', userId] = len(sessions)
    #      [ ] first event date
    result.loc['firstEvent', userId] = getFirstEventDate( userId )
    #      [ ] time played
    #      [ ] dates played
    #      [ ] first played, last played
    # NOTE(review): this assumes the value returned by getUserSessions can be
    # indexed with 'sessionId' - confirm against its definition
    sessionIds = sessions['sessionId']
    for _sessionIdIndex in range(0, len(sessions['sessionId'])):
        _sessionId = sessionIds.iloc[_sessionIdIndex]
        sdp = getSessionDataPreview(_sessionId, _rmDF = _rmDF)

        # four rows per session, labelled with the position of the session
        result.loc['session' + str(_sessionIdIndex) + ' platform',userId] = sdp['platform']
        result.loc['session' + str(_sessionIdIndex) + ' first',userId] = sdp['first']
        result.loc['session' + str(_sessionIdIndex) + ' last',userId] = sdp['last']
        result.loc['session' + str(_sessionIdIndex) + ' events',userId] = str(sdp['events'])
    #      [ ] best chapter
    #      [ ] counts of events: deaths, crafts,...

    #    [ ] GF
    # separator row
    result.loc['GFORM ANALYSIS'] = ' '
    #      [ ] score(s)
    score = getScore( userId )
    for _temporality in score.columns:
        _score = score.loc[scoreLabel,_temporality]
        if(len(_score)>0):
            # last score for 'before', first score for the other temporalities
            if(_temporality == 'before'):
                _score = _score[len(_score)-1]
            else:
                _score = _score[0]
        else:
            _score = np.nan
        result.loc[scoreLabel+_temporality,userId] = _score
    #        [ ] progression
    #      [ ] demographics
    result.loc[scoreLabel+'s',userId] = str(score.values)

    gfDataPreview = getGFormDataPreview(userId, gform)
    # features read from each entry of gfDataPreview, in display order
    features = {1: 'date', 2: 'temporality RM', 3: 'temporality GF', 4: 'score', 5: 'genderAge'}
    for key in gfDataPreview:
        for featureKey in features:
            result.loc[key + ' ' + features[featureKey]] = str(gfDataPreview[key][features[featureKey]])
        # one row per demographic match, numbered from 0
        index = 0
        for match in gfDataPreview[key]['demographic matches']:
            result.loc[key + ' demographic match ' + str(index)] = repr(match)
            index += 1

    return result